import numpy as np
import pandas as pd
from itertools import cycle
from scipy import interp
# sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
# TensorFlow
import tensorflow as tf
# Timer
from timeit import default_timer as timer
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib import cm
# Global matplotlib styling for every figure in this notebook.
# NOTE(review): the 'seaborn-whitegrid' style name was removed in matplotlib >= 3.6
# (renamed to 'seaborn-v0_8-whitegrid') -- confirm the pinned matplotlib version.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")  # silence library warnings for cleaner notebook output
In this article, we demonstrate solving a classification problem in TensorFlow with Estimators, using the Heart Disease Dataset from the UCI Machine Learning Repository.

Picture Source: harvard.edu
We develop a predictive model that can predict whether heart disease is present or absent based on the rest of the given features.
# Load the UCI Statlog (Heart) dataset: space-delimited, no header row.
Data = np.genfromtxt('heart-disease/heart.dat', delimiter=' ')
Attributes = ['Age', 'Sex', 'Chest Pain Type', 'Resting Blood Pressure', 'Serum Cholestoral',
              'Fasting Blood Sugar', 'Resting Electrocardiographic Results', 'Maximum Heart Rate Achieved',
              'Exercise Induced Angina', 'Oldpeak', 'Slope',
              'Number of Major Vessels', 'Thal', 'Heart Disease']
Data = pd.DataFrame(data = Data, columns = Attributes)
# Cast the coded categorical attributes (read as floats by genfromtxt) to
# strings so they are later detected as categorical rather than numeric columns.
Temp = ['Sex', 'Chest Pain Type', 'Fasting Blood Sugar', 'Resting Electrocardiographic Results',
        'Exercise Induced Angina', 'Slope', 'Number of Major Vessels','Thal']
for c in Temp:
    Data[c] = Data[c].astype(int).astype(str)
del Temp, c
Target = 'Heart Disease'
Labels_dict = dict(zip([0,1],['Absent', 'Present']))
# The raw labels are 1 (absent) / 2 (present); shift to 0/1 for binary classification.
Data['Heart Disease'] = (Data['Heart Disease']-1).astype(int)
#
display(Data.head(5))
# NOTE(review): Styler.hide_index() was deprecated in pandas 1.4
# (use .hide(axis='index')) -- confirm the pinned pandas version.
display(pd.DataFrame({'Number of Instances': [Data.shape[0]], 'Number of Attributes': [Data.shape[1]]}).style.hide_index())
# Maps
Maps = {'Sex': {'0':'Female', '1':'Male'},
'Chest Pain Type': {'1':'Typical Angina', '2':'Atypical Angina', '3': 'Non-Anginal Pain', '4':'Asymptomatic'},
'Fasting Blood Sugar': {'0': 'False', '1': 'True'}, 'Exercise Induced Angina': {'0': 'No', '1': 'Yes'},
'Slope': {'1': 'Upsloping', '2': 'Flat', '3': 'Downsloping'},
'Thal': {'3': 'Normal', '6': 'Fixed Defect','7': 'Reversable Defect'}}
| Age | Sex | Chest Pain Type | Resting Blood Pressure | Serum Cholestoral | Fasting Blood Sugar | Resting Electrocardiographic Results | Maximum Heart Rate Achieved | Exercise Induced Angina | Oldpeak | Slope | Number of Major Vessels | Thal | Heart Disease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 70.0 | 1 | 4 | 130.0 | 322.0 | 0 | 2 | 109.0 | 0 | 2.4 | 2 | 3 | 3 | 1 |
| 1 | 67.0 | 0 | 3 | 115.0 | 564.0 | 0 | 2 | 160.0 | 0 | 1.6 | 2 | 0 | 7 | 0 |
| 2 | 57.0 | 1 | 2 | 124.0 | 261.0 | 0 | 0 | 141.0 | 0 | 0.3 | 1 | 0 | 7 | 1 |
| 3 | 64.0 | 1 | 4 | 128.0 | 263.0 | 0 | 0 | 105.0 | 1 | 0.2 | 2 | 1 | 7 | 0 |
| 4 | 74.0 | 0 | 2 | 120.0 | 269.0 | 0 | 2 | 121.0 | 1 | 0.2 | 1 | 1 | 3 | 0 |
| Number of Instances | Number of Attributes |
|---|---|
| 270 | 14 |
def Data_Plot(Inp, Title = None, W = None):
    '''
    Summarize a DataFrame's columns as a bar chart of data completeness.

    Parameters
    ----------
    Inp : pd.DataFrame
        Dataset to summarize.
    Title : str, optional
        Figure title (bold, centered) when given.
    W : int, optional
        Figure width in pixels when given.

    Returns
    -------
    pd.DataFrame
        Per-feature table with dtype, NaN count, size and percentage of
        non-missing values (the data behind the figure).
    '''
    # Per-column dtype and missing-value counts.
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    data_info = data_info.join(Temp, how='outer')
    data_info['Size'] = Inp.shape[0]
    # Percentage of non-NaN values per column.
    data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]),2)
    data_info = data_info.reset_index(drop = False).rename(columns = {'index':'Features'})
    # One bar per feature, colored by dtype.
    fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type',
                 text = 'Percentage',
                 color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data = data_info.columns)
    fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                         bordercolor="DarkGray", borderwidth=1))
    # Idiom fix: compare to None with `is not` (was `if not W == None`).
    if W is not None:
        fig.update_layout(width = W)
    fig.update_traces(texttemplate= 10*' ' + '%%{text}', textposition='inside')
    fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
    if Title is not None:
        fig.update_layout(title={'text': '<b>' + Title + '<b>', 'x':0.5,
                                 'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
    return data_info
# Completeness/dtype overview of the raw dataset.
data_info = Data_Plot(Data, Title = 'Heart Disease Dataset', W = 800)
# A copy of the Dataframe
df = Data.copy()
# TensorFlow feature-column names must not contain spaces, so replace them
# with underscores.
df.columns = [x.replace(' ','_') for x in df.columns]
Temp = Target.replace(' ','_')
# Feature matrix X (DataFrame) and label vector y (numpy array of 0/1).
X = df.drop(columns = Temp)
y = df[Temp].values
del df, Temp
def DatasetTargetDist(Inp, Target, Labels_dict, PD):
    '''
    Show the class distribution of `Target` as a side-by-side count table
    and pie chart (plotly).

    Parameters
    ----------
    Inp : pd.DataFrame -- dataset containing the label column.
    Target : str -- name of the label column in Inp.
    Labels_dict : dict -- maps integer class codes to display names.
    PD : dict -- plot configuration (colors, pull, hole, sizes, title position, ...).

    Displays the figure; returns None.
    '''
    # Table
    Table = Inp[Target].value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
    Table[Target] = Table[Target].replace(Labels_dict)
    Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()),2)
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=PD['column_widths'],
                        specs=[[{"type": "table"},{"type": "pie"}]])
    # Right: pie chart of the class counts.
    fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values,
                         pull=PD['pull'], textfont=dict(size= PD['textfont']),
                         marker=dict(colors = PD['PieColors'], line=dict(color='black', width=1))), row=1, col=2)
    fig.update_traces(hole=PD['hole'])
    fig.update_layout(height = PD['height'], legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
    # Left: table of counts and percentages.
    T = Table.copy()
    # Renders e.g. 44.44 as '%44.44' ('%%' is a literal percent sign).
    T['Percentage'] = T['Percentage'].map(lambda x: '%%%.2f' % x)
    # go.Table wants one array per column.
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= PD['TableColors'][0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [PD['TableColors'][1], PD['TableColors'][1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    fig.update_layout(title={'text': '<b>' + Target + '<b>', 'x':PD['title_x'],
                             'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# "Pull" only the last pie slice slightly out of the pie for emphasis.
Pull = [0 for x in range((len(Labels_dict)-1))]
Pull.append(.05)
# Shared plot-configuration dictionary, reused (and updated) by later helpers.
PD = dict(PieColors = ['SeaGreen','FireBrick'],
          TableColors = ['Navy','White'], hole = .4,
          column_widths=[0.6, 0.4],textfont = 14, height = 350, tablecolumnwidth = [0.20, 0.12, 0.15],
          pull = Pull, legend_title = Target, title_x = 0.5, title_y = 0.8)
del Pull
DatasetTargetDist(Data, Target, Labels_dict, PD)
StratifiedShuffleSplit is a cross-validator that yields randomized, stratified train/test splits: each split contains approximately the same percentage of samples of each target class as the complete set.
# Stratified 70/30 train/test split (preserves the class ratio in both sets).
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
# For Tensorflow: feature-column names must not contain spaces.
X.columns = [x.replace(' ','_') for x in X.columns]
# n_splits=1, so this loop body runs exactly once.
for train_index, test_index in sss.split(X, y):
    # X: label-based vs positional indexing depending on container type.
    if isinstance(X, pd.DataFrame):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
    else:
        X_train, X_test = X[train_index], X[test_index]
    # y -- NOTE(review): both branches are identical; y is a numpy array here
    # (from .values), so the isinstance check is redundant.
    if isinstance(y, pd.Series):
        y_train, y_test = y[train_index], y[test_index]
    else:
        y_train, y_test = y[train_index], y[test_index]
del sss
def Train_Test_Dist(X_train, y_train, X_test, y_test, PD, Labels_dict = Labels_dict):
    '''
    Visualize the train/test split: a table of set shapes plus one
    class-distribution pie per split (plotly).

    Parameters
    ----------
    X_train, X_test : pd.DataFrame -- feature matrices.
    y_train, y_test : array-like -- label vectors.
    PD : dict -- plot configuration (widths, colors, pull, title position, ...).
    Labels_dict : dict -- class code -> display name (module-level default).

    Displays the figure; returns None.
    '''
    def ToSeries(x):
        # Coerce numpy arrays to pd.Series so .replace()/.value_counts() work.
        if not isinstance(x, pd.Series):
            Out = pd.Series(x)
        else:
            Out = x.copy()
        return Out
    fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.02, column_widths= PD['column_widths'],
                        specs=[[{"type": "table"},{'type':'domain'}, {'type':'domain'}]])
    # Right: one pie per split -- train in column 2, test in column 3.
    C = 2
    for y in [ToSeries(y_train).replace(Labels_dict), ToSeries(y_test).replace(Labels_dict)]:
        fig.add_trace(go.Pie(labels= list(Labels_dict.values()),
                             values= y.value_counts().values, pull=PD['pull'],
                             textfont=dict(size=PD['textfont']),
                             marker=dict(colors = PD['PieColors'],
                                         line=dict(color='black', width=1))), row=1, col=C)
        fig.update_traces(hole=.5)
        fig.update_layout(legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
        C+=1
    # Left
    # Table of the four array shapes.
    Table = pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
                               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).astype(str)
    T = Table.copy()
    # go.Table wants one array per column.
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    TableColors = PD['TableColors']
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= TableColors[0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    fig.update_layout(title={'text': '<b>' + 'Dataset Distribution' + '<b>', 'x':PD['title_x'],
                             'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    if not PD['height'] == None:
        fig.update_layout(height = PD['height'])
    fig.show()
# Re-shape the shared plot config for the 3-panel (table + 2 pies) layout.
PD.update(dict(column_widths=[0.3, 0.3, 0.3], tablecolumnwidth = [0.2, 0.4], height = 350, legend_title = Target))
Train_Test_Dist(X_train, y_train, X_test, y_test, PD)
Create the feature columns, using the original numeric columns as is and one-hot-encoding categorical variables.
def Feat_Columns(Inp, Numeric = False, disp_dtype = False):
    '''
    Build TensorFlow feature columns from a DataFrame.

    Columns with a numeric dtype become `numeric_column`s; object (string)
    columns become one-hot `indicator_column`s over their observed vocabulary.

    Parameters
    ----------
    Inp : pd.DataFrame
        Dataset whose columns are turned into feature columns.
    Numeric : list of str, optional
        dtype names to treat as numeric; any falsy value (the default)
        selects the common int/float dtypes.
    disp_dtype : bool
        When True, display a table of the detected numeric/categorical columns.

    Returns
    -------
    list
        tf.feature_column objects, categorical columns first, then numeric.
    '''
    if not Numeric:
        Numeric = ['int64', 'int32', 'float64', 'float32']
    dtype_table = Inp.dtypes.reset_index(drop = False)
    dtype_table.columns = ['Features', 'Data Type']
    dtype_table['Data Type'] = dtype_table['Data Type'].astype(str)
    # Numeric_Columns
    Numeric_Columns = dtype_table.loc[dtype_table['Data Type'].isin(Numeric), 'Features'].tolist()
    # Categorical_Columns: only object (string) dtype columns.
    Categorical_Columns = dtype_table.loc[dtype_table['Data Type'] == 'object','Features'].tolist()
    if disp_dtype:
        display(pd.DataFrame({'Numeric Columns': [', '.join(Numeric_Columns)],
                              'Categorical Columns': [', '.join(Categorical_Columns)]}, index = ['Columns']).T.style)
    # Feature Columns (the redundant len()>0 guards around the loops were removed:
    # iterating an empty list is already a no-op).
    feature_columns = []
    for feature_name in Categorical_Columns:
        # One-hot encode over the values actually present in the data.
        vocabulary = Inp[feature_name].unique()
        feature_columns.append(tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary)))
    for feature_name in Numeric_Columns:
        feature_columns.append(tf.feature_column.numeric_column(feature_name))
    return feature_columns
The input function specifies how data is converted to a tf.data.Dataset that feeds the input pipeline in a streaming fashion. In other words, an input function is a function that returns a tf.data.Dataset object which yields two-element (features, labels) tuples.
def make_input_fn(X, y, inmemory_train = False, n_epochs= None, shuffle=True, batch_size = 256):
    """Build an estimator input_fn over (X, y).

    With inmemory_train=False (default) the returned input_fn streams a
    tf.data.Dataset: optionally shuffled, repeated for n_epochs and batched.
    With inmemory_train=True it hands the whole dataset over at once as
    (dict of feature columns, column-vector labels).
    """
    if inmemory_train:
        # In-memory training expects the labels as a column vector.
        y = np.expand_dims(y, axis=1)
        def input_fn():
            return dict(X), y
    else:
        def input_fn():
            ds = tf.data.Dataset.from_tensor_slices((X.to_dict(orient='list'), y))
            if shuffle:
                ds = ds.shuffle(1000)
            return ds.repeat(n_epochs).batch(batch_size)
    return input_fn
# Feature columns derived from the full feature matrix.
my_feature_columns = Feat_Columns(X)
# Training and evaluation input functions.
train_input_fn = make_input_fn(X_train, y_train)
eval_input_fn = make_input_fn(X_test, y_test, shuffle=False, n_epochs=1)
# Classifier
tf.keras.backend.clear_session()
IT = int(1e3)  # cap on the number of training steps
# NOTE(review): tf.estimator.BoostedTreesClassifier is part of the deprecated
# Estimator API (removed in recent TF releases) -- confirm the pinned TF version.
params = {'n_trees': 50, 'max_depth': 3, 'n_batches_per_layer': 1, 'center_bias': True}
classifier = tf.estimator.BoostedTreesClassifier(my_feature_columns, **params)
# Train model.
start = timer()
classifier.train(train_input_fn, max_steps = IT)
CPU_Time = timer() - start  # wall-clock training time in seconds
# Evaluation.
results = classifier.evaluate(eval_input_fn)
clear_output()
results['CPU Time'] = CPU_Time
display(pd.DataFrame(results, index = ['']).round(4))
| accuracy | accuracy_baseline | auc | auc_precision_recall | average_loss | label/mean | loss | precision | prediction/mean | recall | global_step | CPU Time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.8642 | 0.5556 | 0.9093 | 0.8667 | 0.3973 | 0.4444 | 0.3973 | 0.8205 | 0.4867 | 0.8889 | 178 | 5.3084 |
def ROC_Curve(y_test, probs, n_classes, FS = 7, ax = False, pad = 0.01):
    '''
    Plot per-class, micro- and macro-averaged ROC curves with plotly.

    Parameters
    ----------
    y_test : array-like of int -- true class labels.
    probs : ndarray, shape (n_samples, n_classes) -- predicted class probabilities.
    n_classes : int -- number of classes.
    FS, ax : unused; kept for backward compatibility of the signature.
    pad : float -- axis padding around the [0, 1] range.

    Displays the figure; returns None.
    '''
    # One-hot encode the labels so each class gets its own ROC curve.
    y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=n_classes, dtype='float32')
    # Compute ROC curve and ROC area for each class.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = metrics.roc_curve(y_test_cat[:, i], probs[:, i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])
    # Micro-average: pool the decisions of all classes.
    fpr["micro"], tpr["micro"], _ = metrics.roc_curve(y_test_cat.ravel(), probs.ravel())
    roc_auc["micro"] = metrics.auc(fpr["micro"], tpr["micro"])
    # Macro-average: first aggregate all false positive rates ...
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    # ... then interpolate all ROC curves at these points.
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        # Bug fix: scipy.interp was removed in SciPy >= 1.6; np.interp is the
        # documented drop-in replacement.
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    # Finally average it and compute AUC.
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = metrics.auc(fpr["macro"], tpr["macro"])
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], name = 'FPR = TPR', line = dict(color='Black', width=2, dash='dash')))
    fig.add_trace(go.Scatter(x=fpr["micro"], y=tpr["micro"], mode='lines', marker_color = 'deeppink',
                             name='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"])))
    fig.add_trace(go.Scatter(x=fpr["macro"], y=tpr["macro"], mode='lines', marker_color = 'navy',
                             name='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"])))
    colors = cycle(['Aqua', 'DarkOrange', 'CornflowerBlue'])
    for i, color in zip(range(n_classes), colors):
        # Bug fix: use the cycled `color` -- the original passed a whole palette
        # list (px.colors.sequential.Rainbow) and ignored `color` entirely.
        _ = fig.add_trace(go.Scatter(x = fpr[i], y = tpr[i], mode='lines', marker_color= color,
                                     name='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])))
    # Background
    fig.update_layout(plot_bgcolor= 'white')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray', range =[-pad, 1+pad],
                     title = 'False Positive Rate (FPR)')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray', range =[-pad, 1+pad],
                     title = 'True Positive Rate (TPR)')
    fig.update_yaxes(scaleanchor = "x", scaleratio = 1)
    fig.update_layout(height = 600, width = 810)
    fig.update_layout(title={'text': '<b>' + 'Receiver Operating Characteristic (ROC) Curves' + '<b>', 'x': .5,
                             'y': .9, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Per-example predictions on the held-out test set.
pred_dicts = list(classifier.predict(eval_input_fn))
clear_output()
# (n_samples, n_classes) matrix of class probabilities.
probs = np.array([pred['probabilities'] for pred in pred_dicts])
ROC_Curve(y_test, probs, n_classes = len(Labels_dict), FS = 8)
We can investigate the feature importances for this classification task. This is similar to the feature importances of scikit-learn models and has been outlined in [6].
# Predict with per-example directional feature contributions (DFCs).
pred_dicts = list(classifier.experimental_predict_with_explanations(eval_input_fn))
clear_output()
# Create DFC Pandas dataframe.
labels = y_test
# Probability of the positive class ('Present') per test example.
probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])
# One row per test example, one column per feature contribution.
df_dfc = pd.DataFrame([pred['dfc'] for pred in pred_dicts])
df_dfc.columns = [x.replace('_',' ') for x in df_dfc.columns]
# Summary statistics of the contributions, with color gradients per column.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3
# (use .format(precision=...)) -- confirm the pinned pandas version.
display(df_dfc.describe().T.style.background_gradient(subset= ['mean'], cmap='RdYlGn')\
        .background_gradient(subset= ['std'], cmap='RdYlGn')\
        .background_gradient(subset= ['min'], cmap='hot')\
        .background_gradient(subset= ['max'], cmap='winter')
        .set_precision(4).format({'count': "{:.0f}"}))
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Number of Major Vessels | 81 | 0.0154 | 0.1060 | -0.1366 | -0.0683 | -0.0356 | 0.1020 | 0.2979 |
| Chest Pain Type | 81 | 0.0037 | 0.1053 | -0.2074 | -0.0824 | 0.0446 | 0.0867 | 0.1882 |
| Thal | 81 | 0.0090 | 0.0880 | -0.1044 | -0.0680 | -0.0488 | 0.0877 | 0.1807 |
| Slope | 81 | 0.0008 | 0.0719 | -0.1815 | -0.0582 | -0.0225 | 0.0602 | 0.1473 |
| Resting Blood Pressure | 81 | 0.0039 | 0.0628 | -0.1854 | -0.0341 | -0.0003 | 0.0284 | 0.2500 |
| Sex | 81 | 0.0186 | 0.0685 | -0.1188 | -0.0341 | 0.0286 | 0.0497 | 0.1455 |
| Maximum Heart Rate Achieved | 81 | 0.0033 | 0.0846 | -0.1506 | -0.0661 | 0.0058 | 0.0760 | 0.1988 |
| Exercise Induced Angina | 81 | 0.0030 | 0.0264 | -0.0766 | -0.0074 | -0.0042 | 0.0200 | 0.0713 |
| Oldpeak | 81 | -0.0059 | 0.0721 | -0.1866 | -0.0450 | -0.0201 | 0.0229 | 0.2642 |
| Serum Cholestoral | 81 | 0.0017 | 0.0618 | -0.1649 | -0.0354 | 0.0041 | 0.0292 | 0.2112 |
| Fasting Blood Sugar | 81 | -0.0004 | 0.0101 | -0.0454 | 0.0000 | 0.0004 | 0.0012 | 0.0301 |
| Resting Electrocardiographic Results | 81 | 0.0001 | 0.0114 | -0.0322 | -0.0008 | 0.0000 | 0.0004 | 0.0515 |
| Age | 81 | -0.0040 | 0.0607 | -0.1818 | -0.0357 | -0.0083 | 0.0326 | 0.2105 |
A nice property of DFCs is that the sum of the contributions + the bias is equal to the prediction for a given example.
# Sanity check: sum of DFCs + bias == predicted probability for every example.
bias = pred_dicts[0]['bias']
dfc_prob = df_dfc.sum(axis=1) + bias
np.testing.assert_almost_equal(dfc_prob.values, probs.values)
Plot DFCs for an individual patient which is color-coded based on the contributions' directionality and add the feature values on the figure.
def _add_feature_values(feature_values, ax, colors):
    """Display feature's values on left of plot.

    feature_values : pd.Series-like mapping feature name -> display value,
        in the same bottom-to-top order as the bars on `ax`.
    ax : matplotlib Axes holding the horizontal bar chart (used for the x anchor).
    colors : sequence of color names, one per feature, used as box face colors.
    """
    # Anchor the text column at the left edge of the plot.
    x_coord = ax.get_xlim()[0]
    OFFSET = 0.15
    for y_coord, (feat_name, feat_val) in enumerate(feature_values.items()):
        # NOTE(review): draws via pyplot's implicit current axes (plt.text), not
        # `ax` directly -- assumes `ax` is the current axes; confirm at call sites.
        t = plt.text(x_coord, y_coord - OFFSET, '{}'.format(feat_val), size=12)
        t.set_bbox(dict(facecolor= colors[y_coord], alpha=0.25))
    font = FontProperties()
    # font.set_weight('bold')
    # Column header, one row above the topmost value (reuses the loop's y_coord).
    t = plt.text(x_coord, y_coord + 1 - OFFSET, 'Feature\nValue', fontproperties=font, size=13)
def _yaxis_labels(ax):
y_labels = []
for c in [c.get_text() for c in ax.get_yticklabels()]:
List = list(mit.locate(c, lambda x: x == " "))
if len(List)>1:
List = List[1::2]
Temp1 = list(c)
for position in List:
Temp1[position] = '\n'
c = "".join(Temp1)
y_labels.append(c)
return y_labels
def _xLims(ax):
Temp = np.linspace(-1,1,21, endpoint=True)
Temp = np.round(Temp,1)
xlims = ax.get_xlim()
for l, r in list(zip(Temp[:-1],Temp[1:])):
if l<= xlims[0] < r:
Left = l
if l<= xlims[1] < r:
Right = r
return [Left, Right]
def Plot_Example(example, TOP_N = 10, Pos_Color = 'LimeGreen', Neg_Color = 'OrangeRed', Maps = Maps, FS = (13, 7)):
    '''
    Horizontal bar chart of one example's directional feature contributions.

    Positive contributions are drawn in Pos_Color, negative ones in Neg_Color,
    and the example's actual feature values are written beside the bars.

    Parameters
    ----------
    example : pd.Series -- one row of the DFC DataFrame (feature -> contribution).
    TOP_N : int -- number of largest-|contribution| features to show.
    Pos_Color, Neg_Color : str -- bar colors for positive/negative contributions.
    Maps : dict -- code -> label maps for categorical features (module default).
    FS : tuple -- figure size in inches.

    Returns the matplotlib Axes.

    NOTE(review): reads the module-level globals `X_test` and `ID` to look up
    the example's feature values -- `ID` must be set before calling.
    '''
    # NOTE(review): mutates the caller's Series index (underscores -> spaces).
    example.index = [x.replace('_',' ') for x in example.index]
    # Sorting by absolute value
    sorted_ix = example.abs().sort_values()[-TOP_N:].index
    example = example[sorted_ix]
    fig, ax = plt.subplots(1, 1, figsize= FS)
    Temp = example.to_frame('Value').sort_index(ascending= False)
    # Three overlaid barh passes: positive bars, negative bars, black outlines.
    Temp0 = Temp.copy(); Temp0[Temp0 < 0] = np.nan
    _ = Temp0.plot(kind='barh', color= Pos_Color, edgecolor = 'white', hatch = '///', legend=None, alpha=0.75, ax = ax)
    Temp0 = Temp.copy(); Temp0[Temp0 >= 0] = np.nan
    _ = Temp0.plot(kind='barh', color= Neg_Color, edgecolor = 'white', hatch = '///', legend=None, alpha=0.75, ax = ax)
    _ = Temp.plot(kind='barh', color='None', edgecolor = 'Black', legend=None, alpha=1, lw=1.2, ax = ax)
    del Temp, Temp0
    _ = ax.grid(False, axis='y')
    # Y axis Labels
    # _ = ax.set_yticklabels(_yaxis_labels(ax), size=12)
    # x axis Limits
    _ = ax.set_xlim(_xLims(ax))
    # Add feature values, mapping categorical codes to readable labels first.
    Temp = X_test.copy()
    Temp.columns = [x.replace('_',' ') for x in Temp.columns]
    for c in Maps.keys():
        Temp[c] = Temp[c].map(Maps[c])
    # Box colors follow the sign of each contribution.
    colors = example.map(lambda x: Pos_Color if x >= 0 else Neg_Color).tolist()
    _add_feature_values(Temp.iloc[ID][sorted_ix], ax, colors)
    return ax
# Plot the DFCs for one test-set example.
ID = 20  # positional index of the example within the test set
Tops = X_train.shape[1]  # show all features
ax = Plot_Example(df_dfc.iloc[ID], TOP_N = Tops, FS = (13, 8))
_ = ax.set_title('Feature contributions for example patient {} from the Test set\n Pred: {:1.2f}; Label: {}'
                 .format(ID, probs[ID], labels[ID]))
_ = ax.set_xlabel('Contribution to Predicted Probability', size=14)
# Gain-based feature importances, normalized to sum to 1.
# (Dead code removed: the original called experimental_feature_importances
# twice with identical arguments and built an intermediate `Temp` table that
# was never read afterwards.)
importances = classifier.experimental_feature_importances(normalize=True)
df_imp = pd.Series(importances)
def Plot_FeatImportance(pds, TOP_N = 10, FS = (13, 6)):
    """Horizontal bar chart of the first TOP_N feature importances in `pds`.

    pds : pd.Series indexed by feature name (underscores become spaces).
    TOP_N : int -- number of entries (by position) to plot.
    FS : tuple -- figure size in inches.

    Returns the matplotlib Axes.
    """
    table = pds.iloc[0:TOP_N][::-1].reset_index()
    table.columns = ['Features', 'Importance']
    table = table.sort_values(by=['Importance'])
    table['Features'] = table['Features'].map(lambda s: s.replace('_', ' '))
    fig, ax = plt.subplots(1, 1, figsize = FS)
    palette = sns.color_palette("RdYlGn", TOP_N)
    # Two passes: filled hatched bars first, then an outline-only overlay.
    _ = sns.barplot(ax = ax, x='Importance', y= 'Features', data= table, palette=palette, hatch = '//')
    _ = sns.barplot(ax = ax, x='Importance', y= 'Features', data= table, facecolor = 'None', edgecolor = 'Indigo')
    _ = ax.grid(False, axis='y')
    # Y axis Labels
    # _ = ax.set_yticklabels(_yaxis_labels(ax), size=12)
    # Snap the x limits outward to the 0.1 grid.
    _ = ax.set_xlim(_xLims(ax))
    return ax
# Gain-based importances.
ax = Plot_FeatImportance(pd.Series(importances), TOP_N = Tops, FS = (13, 7.5))
_ = ax.set_xlim(right = .16)
# Mean absolute DFC as an alternative importance measure.
ax = Plot_FeatImportance(df_dfc.abs().mean(), TOP_N = Tops, FS = (13, 7.5))
_ = ax.set_xlim(right = .1)
def permutation_importances(est, X_eval, y_test, metric, features):
    """Permutation importance: shuffle one column at a time and measure the
    drop in `metric` on the evaluation set.

    source: http://explained.ai/rf-importance/index.html
    A similar approach can be done during training -- see "Drop-column
    importance" in the article above.

    Returns a numpy array with one importance (baseline score minus shuffled
    score) per entry of `features`.  `X_eval` is modified in place while each
    column is shuffled, but every column is restored before returning.
    """
    baseline_score = metric(est, X_eval, y_test)
    importances = []
    for feature in features:
        original_column = X_eval[feature].copy()
        X_eval[feature] = np.random.permutation(X_eval[feature])
        shuffled_score = metric(est, X_eval, y_test)
        X_eval[feature] = original_column
        importances.append(baseline_score - shuffled_score)
    return np.array(importances)
def accuracy_metric(est, X, y):
    """Accuracy of TensorFlow estimator `est` evaluated on (X, y)."""
    input_fn = make_input_fn(X, y=y, shuffle=False, n_epochs=1)
    results = est.evaluate(input_fn=input_fn)
    return results['accuracy']
# Permutation importances measured on the held-out test set.
features = X_train.columns.tolist()
importances = permutation_importances(classifier, X_test, y_test, accuracy_metric,
                                      features)
df_imp = pd.Series(importances, index=features)
clear_output()
ax = Plot_FeatImportance(df_imp, TOP_N = Tops, FS = (13, 7.5))